#Alexander F. Gazmararian
#afg2@princeton.edu
#January 9, 2024

#Purpose: This file merges together all data sets for analysis, and performs the final setup for the covariates

#Load packages
library(tidyverse)
library(tidylog)
library(here)

# Merge elections with census ----
# Load data
census <- readRDS(here("data", "inter", "census00.rds"))
pres <- readRDS(here("data", "inter", "pres_elec.rds"))
# Merge
g <- left_join(pres, census, by = "fips")
## Note: the two missing observations here cannot easily be handled
##    broomfield, colorado, splits from one county into three in 2001
##    south boston, virginia, is absorbed by another county
#     data does not include Puerto Rico nor Alaska nor DC
# anti_join(census, pres) %>% dplyr::select(fips)

# Merge with CBP ----
# Load CBP data
cbp <- readRDS(here("data", "inter", "cbp.rds"))
# Subset to only election years
cbp_merge <- subset(cbp, year %in% seq(1972,2020,4))
# Update FIPS to match merged counties---does not need to happen before "cbp_merge"
# This is for the pre-treatment share of coal employment below
cbp[cbp$fips==51515,]$fips<- 51019
cbp[cbp$fips==51560,]$fips<- 51083
cbp[cbp$fips==12025,]$fips <- 12086
cbp[cbp$fips==29193,]$fips <- 29186
cbp[cbp$fips==46113,]$fips <- 46102
cbp[cbp$fips==46131,]$fips <- 46071
# Aggregate data---does not need to happen before "cbp_merge"
cbp <- cbp %>%
  group_by(year,fips) %>%
  summarise(across(emp_all:oilgas, ~ sum(.x,na.rm=TRUE)))
# Merge
g <- left_join(g, cbp_merge, by = c("fips", "year"))
# Notes: Regarding missingness, data begin in 1975, so missing for 1972 election
##    Most missing obs. are Alaska, which we do not have consistent election data on; also do not include DC
# subset(g, is.na(emp_all) & year!=1972, select = c(county,state,fips,year))
# anti_join(cbp_merge, g) %>% dplyr::select(fips,year) %>% unique() %>% print(n=400)

# Merge with DTI ----
# Load data from 2006, prior to the shale shock
debt <- readRDS(here("data", "inter", "debt07.rds"))
# Merge
g <- left_join(g, debt, by = "fips")
# Notes: Regarding missingness, same as Alaska and DC obs from before except
##    clifton forge, which like south boston, gets absorbed into another county
# subset(g, is.na(debt06), select = c(county,state,fips)) %>% unique()
# anti_join(debt, g) %>% print(n=100)
#missing are alaska and DC

# Merge with adjacency data ----
# Load data
adj <- readRDS(here("data", "inter", "coal_adjacency.rds"))
# Merge data
g <- left_join(g, adj, by = c("fips"))
# Notes: missing are the non-adjacent counties as designed as well ass Alaska and DC
# subset(g, is.na(adjgroup), select = c(county,state,fips)) %>% unique()
# anti_join(adj, g, by = "fips") %>% print(n=100)

# Merge with break years ----
# Load data
break_years <- readRDS(here("data", "inter", "breaks.rds"))
# Merge
g <- left_join(g, break_years, by = "state")
# Notes: Missing is Alaska
# anti_join(break_years, g)

# Merge with EIA gas power plant data ----
# Load data, which begin in 1992---this is distinace to power plant data
eia <- readRDS(here("data", "inter", "eiagenerator", "eia_data_final.rds"))
# Merge
g <- left_join(g, eia, by = c("year", "fips"))
# Notes: only missing are broomfiled and south boston like before
#subset(g, is.na(NewGasDistance) & year>=1992, select = c(county,state,fips,year)) %>% unique()

# Merge with fuel prices data ----
# Load Data
fuelprices <- readRDS(here("data", "inter", "fuelprices.rds"))
# Merge
g <- left_join(g, fuelprices, by = "year")
# Notes: no missing data once accounting for the year
# subset(g, is.na(coalgas_ratio_z) & year>=1997, select = c(county,state,fips,year)) %>% unique()

# Merge data for falsification tests ----
# Load data
noncoal <- readRDS(here("data", "inter", "noncoalmines.rds"))
# Subset to counties and election years
noncoal <- subset(noncoal, !is.na(fips) & year%in%seq(2000,2020,4))
# Merge
g <- left_join(g, noncoal, by = c("year","fips"))
# Notes: missing are either Alaska or an erroneous admin code
# anti_join(noncoal, g, by = c("year", "fips")) %>% ungroup() %>% dplyr::select(fips) %>% unique() %>% print(n=100)

# Merge with union data ----
# Load data
union <- readRDS(here("data", "inter", "eia7a.rds"))
# Subset to relevant years
union <- subset(union,year%in%c(1972,1976,1980,1984,1988,1992,1996, 2000,2004,2008,2012,2016,2020))
# Merge data
g <- left_join(g, union, by = c("year", "fips"))
# Code missing as 0 because union data is the complete universe
g <- g %>% mutate(across(prod_nonunion:unionshare, ~ ifelse(is.na(.x) & year > 1983, 0, .x)))

# Account for FIPS changes----
# County unique FIPS over time to identify odd FIPS codes
g.freq <- g %>%
  group_by(state,county) %>%
  count()
table(g.freq$n)

# Convert "Shannon County"--> "Oglala Lakota County"
g[g$county=="shannon"&g$state=="South Dakota",]$county <- "oglala lakota"

#subset(g.freq,n==5)
#1 Colorado broomfield     5
#Colorado, 2001: Broomfield county (FIPS 8014) is created out of parts of Adams, Boulder, Jefferson, and Weld counties.
#The Census Bureau estimates that the resulting population loss was 21,512 for Boulder, 15,870 for Adams, 1,726 for Jefferson,
#and 69 for Weld county.

#no action to take here because cannot retrospectively disaggregate boulder,adams,jefferson,and weld

#subset(g.freq,n==6)
#1 Virginia south boston     6
#Virginia, 1995: The independent city of South Boston (FIPS 51780) merges into Halifax county (FIPS 51083).

#merge Halifax and South Boston
g[g$county=="south boston",]$fips <- 51083
g[g$county=="south boston",]$county <- "halifax"
g[g$county=="halifax"&g$state=="Virginia",]$adjgroup <- NA

#subset(g.freq,n==8)
#Virginia, 2001: The independent city of Clifton Forge (FIPS 51560) merges into Alleghany county (FIPS 51005).
g[g$county=="clifton forge",]$fips <- 51005
g[g$county=="clifton forge",]$county <- "alleghany"
g[g$county=="alleghany"&g$state=="Virginia",]$adjgroup <- "Distant Neighbor"

#subset(g.freq,n==10)
#1 Arizona    la paz    10
#Arizona, 1983: La Paz county (FIPS 4012) is created out of parts of Yuma county (FIPS 4027).
#like broomfield, cannot retrospectively create

#2 New Mexico cibola    10
#New Mexico, 1981: Cibola county (FIPS 35006) is created out of parts of Valencia county (FIPS 35061).
#like broomfield, cannot retrospectively create

#subset(g.freq,n==12)
#1 Virginia bedford city     12
#Virginia, 2013: The independent city of Bedford (FIPS 51515) merges into Bedford County (FIPS 51019)
g[g$county=="bedford city",]$fips <- 51019
g[g$county=="bedford city",]$county <- "bedford"

#subset(g,county=="bedford")
#2 Virginia manassas         12
#3 Virginia manassas park    12
#4 Virginia norfolk          12
#5 Virginia poquoson         12

# Aggregate data to account for these corrected FIPS codes
g <- g %>%
  group_by(county,year,state,fips,adjgroup,break_year) %>%
  summarise(
    across(c(TotalVotes,RepVotes,DemVotes,ThirdVotes,OtherVotes,pop:oilgas,noncoalmine_ironOre:unionshare), ~ sum(.x, na.rm = TRUE)),
    across(c(debt06,Gas:coalgas_ratio_z), ~ mean(.x, na.rm = TRUE))
  )
# Account for missingness of unions data
g <- g %>% mutate(across(c(union, unionshare), ~ ifelse(year <= 1983, NA, .x)))

#unify fips codes---but don't merge anything based on non-unified FIPS codes past this point
g[g$fips==12025,]$fips <- 12086
g[g$fips==29193,]$fips <- 29186
g[g$fips==46113,]$fips <- 46102

#Check remaining FIPS codes frequency 
g.freq <- g %>%
  group_by(state,county) %>%
  count()
table(g.freq$n)

# Code covariates ----
## Election outcomes ----
# Create share measures
g <- mutate(g, RepVotesMajorPercent = (RepVotes / (RepVotes + DemVotes)) * 100)
## Census ----
# Adjust to share
g <- g %>% mutate(across(c(white:poverty,hispanic:edu_somecollege_m), ~ .x / pop))
# Log population
g$poplog <- log(g$pop)
# Log income
g$income99pclog <- log(g$income99pc)
# Fix infinite values
g$poplog <- ifelse(is.infinite(g$poplog), NA, g$poplog)
g$income99pclog <- ifelse(is.infinite(g$income99pclog), NA, g$income99pclog)
##aggregate education and age measures
g <- g %>%
  mutate(
    under40 = age_under18 + age_19_29 + age_30_39,
    college = edu_ba_f + edu_grad_f + edu_ba_m + edu_grad_m
  )
## standardize census variables across state level
g00 <- subset(g, year == 2000, select = c(fips, state, pop:edu_somecollege_m, poplog, income99pclog, under40, college))
g00 <- g00 %>%
  group_by(state) %>%
  mutate(across(pop:college, .fns = list("s"=~ scale(.x)[,1]), .names = "{.col}_{.fn}"))
g00 <- subset(g00, select = -c(pop:college))
g <- left_join(g, g00, by = c("fips", "state"))
## Employment data ----
g <- g %>%
  ungroup() %>%
  mutate(
    oilgas_prop = case_when(emp_all != 0 ~ oilgas / emp_all, emp_all == 0 ~ 0, T ~ NA_real_),
    coal_prop = case_when(emp_all != 0 ~ coal / emp_all, emp_all == 0 ~ 0, T ~ NA_real_),
    power_prop = case_when(emp_all != 0 ~ power / emp_all, emp_all == 0 ~ 0, T ~ NA_real_)
  )
# Make sure pre 1975 data are coded as 0 because there are no employment data in that year
g <- g %>% mutate(across(oilgas_prop:power_prop, ~ ifelse(year<1975, NA, .x)))
# Convert to percentage
g <- g %>% mutate(across(oilgas_prop:power_prop, ~ .x * 100))
## Pre-treatment share of coal employment ----
# Create proportion
cbp <- cbp %>%
  mutate(
    oilgas_prop = case_when(emp_all != 0 ~ oilgas / emp_all, emp_all == 0 ~ 0),
    coal_prop = case_when(emp_all != 0 ~ coal / emp_all, emp_all == 0 ~ 0),
    power_prop = case_when(emp_all != 0 ~ power / emp_all, emp_all == 0 ~ 0)
  )
# Subset to 2000 data
cbp00 <- subset(cbp, year == 2000, select = c(fips, coal_prop))
# Create state FIPS code
cbp00$state <- substr(str_pad(cbp00$fips, 5, "left", pad = "0"), 0,2)
# Rename
names(cbp00)[2] <- "coal_prop_00"
# Standardize by county
cbp00$coal_prop_00_c <- scale(cbp00$coal_prop_00)[,1]
# Standardize by within-state variation
cbp00 <- cbp00 %>%
  group_by(state) %>%
  mutate(coal_prop_00_s = scale(coal_prop_00)[,1])
# Remove state variable
cbp00$state <- NULL
# Remerge with main data set
g <- left_join(g, cbp00, by = c("fips"))

## Pre-shock share of coal employment ----
# Create relevant subset
cbp0507 <- subset(cbp, year %in% c(2005,2006,2007), select = c(fips, coal_prop))
# Aggregate
cbp0507 <- cbp0507 %>%
  group_by(fips) %>%
  summarize(coal_prop_pre = mean(coal_prop,na.rm=TRUE))
# Standardize within county
cbp0507$coal_prop_pre_c <- scale(cbp0507$coal_prop_pre)[,1]
# Create state code
cbp0507$state <- substr(str_pad(cbp0507$fips, 5, "left", pad = "0"), 0,2)
# Standardize within state
cbp0507 <- cbp0507 %>%
  group_by(state) %>%
  mutate(coal_prop_pre_s = scale(coal_prop_pre)[,1])
# Remove state code
cbp0507$state <- NULL
# Add back to main data
g <- left_join(g, cbp0507, by = "fips")

## Indicator for coal employment loss ----
coal_chg <- cbp %>%
  group_by(fips) %>%
  mutate(
    elecyear = case_when(
      year >= 2017 & year <= 2020 ~ 2020,
      year >= 2013 & year <= 2016 ~ 2016,
      year >= 2009 & year <= 2012 ~ 2012,
      year >= 2005 & year <= 2008 ~ 2008,
      year >= 2001 & year <= 2004 ~ 2004,
      year >= 1997 & year <= 2000 ~ 2000,
      year >= 1993 & year <= 1996 ~ 1996,
      year >= 1989 & year <= 1992 ~ 1992,
      year >= 1985 & year <= 1988 ~ 1988,
      year >= 1981 & year <= 1984 ~ 1984,
      year >= 1977 & year <= 1980 ~ 1980,
      year >= 1973 & year <= 1976 ~ 1976,
      T ~ NA_real_
    )
  ) %>%
  group_by(fips,elecyear) %>%
  summarise(coal = mean(coal)) %>%
  arrange(fips,elecyear) %>%
  group_by(fips) %>%
  mutate(coal_chg = coal - dplyr::lag(coal))
# Subset to relevant variables
coal_chg <- subset(coal_chg, select = -c(coal))
coal_chg <- subset(coal_chg, !is.na(elecyear))
# Merge with main data
g <- left_join(g,coal_chg,by=c("fips"="fips","year"="elecyear"))
# Note: most missing are Alaska and district of columbia, then two cities in virginia
# anti_join(coal_chg,g,by=c("fips","elecyear"="year")) %>% dplyr::select(fips) %>% unique() %>% print(n=100)
# Create an indicator for if a county experienced a decline of coal employment between election years
g$coallayoff <- ifelse(g$coal_chg < 0, 1, 0)
# Create an indicator for if a county experienced greater than a one-standard deviation decline of coal employment between election years
g <- g %>%
  group_by(fips) %>%
  mutate(coalchgsd = sd(coal_chg,na.rm=TRUE),
         coallayoff1sd = ifelse(coal_chg < -coalchgsd, 1, 0))
# Create indicators for layoffs in which year
coallayoff_08 <- subset(g, coallayoff == 1 & year == 2008)$fips
coallayoff_12 <- subset(g, coallayoff == 1 & year == 2012)$fips
coallayoff_16 <- subset(g, coallayoff == 1 & year == 2016)$fips
coallayoff_20 <- subset(g, coallayoff == 1 & year == 2020)$fips
# Create treatment indicator
g <- g %>%
  ungroup() %>%
  mutate(
    coallayoff_treat = case_when(
      fips %in% coallayoff_08 & year >= 2008 ~ 1,
      fips %in% coallayoff_12 & year >= 2012 ~ 1,
      fips %in% coallayoff_16 & year >= 2016 ~ 1,
      fips %in% coallayoff_20 & year >= 2020 ~ 1,
      T ~ 0
    )
  )
# Create treatment indicator for more than a standard deviation increase in layoffs
coallayoff1sd_08 <- subset(g, coallayoff1sd == 1 & year == 2008)$fips
coallayoff1sd_12 <- subset(g, coallayoff1sd == 1 & year == 2012)$fips
coallayoff1sd_16 <- subset(g, coallayoff1sd == 1 & year == 2016)$fips
coallayoff1sd_20 <- subset(g, coallayoff1sd == 1 & year == 2020)$fips
g <- g %>%
  ungroup() %>%
  mutate(
    coallayoff1sd_treat = case_when(
      fips %in% coallayoff1sd_08 & year >= 2008 ~ 1,
      fips %in% coallayoff1sd_12 & year >= 2012 ~ 1,
      fips %in% coallayoff1sd_16 & year >= 2016 ~ 1,
      fips %in% coallayoff1sd_20 & year >= 2020 ~ 1,
      T ~ 0
    )
  )
# Create indicators for matching
g_treated <- unique(subset(g, coallayoff_treat == 1, select = fips))
g_treated1sd <- unique(subset(g, coallayoff1sd_treat == 1, select = fips))
g$treated_coallayoff <- ifelse(g$fips %in% g_treated$fips, 1, 0)
g$treated_coallayoff1sd <- ifelse(g$fips %in% g_treated1sd$fips, 1, 0)
## Unique county-state id ----
g$county_state <- paste0(g$county,g$state)

## Standardize variables ----
# Ensure that the variables are standardized with temporal variation according to relevant level
g <- g %>%
  # Estimate within-state variation
  group_by(state) %>%
  mutate(NewGasDistance_s = scale(NewGasDistance)[,1]) %>%
  # Estimate within-county variation
  group_by(county_state) %>%
  mutate(NewGasDistance_c = scale(NewGasDistance)[,1])

## Treatment indicators ----
# Create indicator for matching
g <- g %>%
  mutate(
    treat = case_when(
      adjgroup == "Coal" ~ 1,
      adjgroup %in% c("Distant Neighbor","Near Neighbor") ~ 0,
      T ~ NA_real_
    )
  )
# Create indicator for pooled dd with all units
g$coal_dd <- ifelse(is.na(g$treat), 0, g$treat)
# Indicators for post shock and coal county
g$coal_bin <- ifelse(g$adjgroup=="Coal", 1,0)
g$post_shock <- ifelse(g$year >= 2008, 1, 0)
# Treatment indicator for FEct
g$treated <- ifelse(g$year >= 2008 & g$treat == 1, 1, 0)
# Define treatment status so it is only in counties where coal is a big part of the local labor market
g <- g %>%
  ungroup() %>%
  mutate(
    emptreat = case_when(
      treated_coallayoff1sd == 1 & !is.na(adjgroup) ~ 1,
      T ~ 0
    )
  )
#create variable treatment onset indicator
g$treated_var <- ifelse(g$year >= g$break_year & g$treat == 1, 1, 0)
# Create sample indicator
g <- g %>%
  mutate(
    sample = case_when(
      adjgroup=="Coal"~"Coal",
      !is.na(adjgroup)~"Matching Pool",
      is.na(adjgroup)~"Rest of Country"
    )
  )
## General layoffs ----
layoff <- cbp %>%
  group_by(fips,year) %>%
  arrange() %>%
  group_by(fips) %>%
  mutate(emp_chg = emp_all - lag(emp_all),
         emp_chg_sd = sd(emp_chg, na.rm = TRUE))
# Construct indicator
layoff$layoff <- ifelse(layoff$emp_chg < -1 * layoff$emp_chg_sd, 1, 0)
# Subset to relevant variables
layoff <- subset(layoff, select = c(year,fips,layoff,emp_chg))
# Merge back with main data
g <- left_join(g, layoff, by = c("fips", "year"))
# Note: missing are from non-election years

## Non-coal mine falsification tests ----
### Anthracite mining ---- 
# Load data
ant <- readRDS(here("data", "inter", "cbp_ant.rds"))
# Subset to counties with more than 50 employees in anthracite mining post-2005
ant05post <- subset(ant, year >= 2005 & emp >= 50)
# Create indicator
g$ant_treat_emp <- ifelse(g$fips %in% ant05post$fips, 1, 0)

### Iron ore mining----
g$noncoalmine_ironOre <- with(g, ifelse(is.na(noncoalmine_ironOre), 0, noncoalmine_ironOre))
ironmines <- subset(g, select = c(fips, noncoalmine_ironOre), noncoalmine_ironOre == 1)
ironmines <- unique(ironmines)
names(ironmines)[2] <- "iron_falsify"
g <- left_join(g, ironmines, by = "fips")
g$iron_falsify <- with(g, ifelse(is.na(iron_falsify), 0, iron_falsify))

### silver mining----
g$noncoalmine_silverOre <- with(g, ifelse(is.na(noncoalmine_silverOre), 0, noncoalmine_silverOre))
silvermines <- subset(g, select = c(fips, noncoalmine_silverOre), noncoalmine_silverOre == 1)
silvermines <- unique(silvermines)
names(silvermines)[2] <- "silver_falsify"
g <- left_join(g, silvermines, by = "fips")
g$silver_falsify <- with(g, ifelse(is.na(silver_falsify), 0, silver_falsify))

### Sandstone mining----
g$noncoalmine_dimensionSandstone  <- with(g, ifelse(is.na(noncoalmine_dimensionSandstone), 0, noncoalmine_dimensionSandstone))
dimensionSandstone <- subset(g, select = c(fips, noncoalmine_dimensionSandstone), noncoalmine_dimensionSandstone == 1)
dimensionSandstone <- unique(dimensionSandstone)
names(dimensionSandstone)[2] <- "sandstone_falsify"
g <- left_join(g, dimensionSandstone, by = "fips")
g$sandstone_falsify <- with(g, ifelse(is.na(sandstone_falsify), 0, sandstone_falsify))

# Merge with national coal employment data ----
# This is for the shift-share analysis
# Aggregate data to national coal employment
cbpnat <- cbp %>%
  group_by(year) %>%
  summarize(coal = sum(coal, na.rm = TRUE),
            empall = sum(emp_all, na.rm = TRUE))
# Create national proportion of coal employment
cbpnat$nat_coal_prop <- with(cbpnat, coal / empall)
# Standardize
cbpnat$nat_coal_z <- scale(cbpnat$nat_coal_prop)[,1] * -1
# Rename variables
cbpnat <- rename(cbpnat, coal_nat = coal)
# Merge back with main data
g <- left_join(g, cbpnat, by = "year")

# Merge with ARC data----
# Load data
grants <- readRDS(here("data", "inter", "power_grants.rds"))
# Merge data
g <- left_join(g, grants, by = c("year", "fips"))
# Convert NA to 0 since we have the full universe of POWER grants
g <- g %>% mutate(across(funds_arc:grants_cum, ~ ifelse(is.na(.x), 0, .x)))

#Subset to relevant variables for analysis (as instructed by JOP)----
g <- subset(g, select = -c(ThirdVotes, OtherVotes, age_19_29:edu_somecollege_m,
                           indig, asian, pacific, raceother, racemore, work_male, urban, rural_farm,
                           Gas, work_male_s, indig_s, asian_s, pacific_s, raceother_s, racemore_s,
                           work_male_s, urban_s, rural_farm_s, age_19_29_s:edu_somecollege_m_s,
                           rural_s, coalchgsd, coallayoff1sd,coallayoff1sd_treat,treated_coallayoff1sd,
                           noncoalmine_ironOre, noncoalmine_silverOre, noncoalmine_dimensionSandstone, prod_nonunion, prod_union, rural
                           ))
  
#Save output----
saveRDS(g, here("data", "output", "pres_out.rds"))
